
################################################################################
################################################################################
###                                                                          ###
### --- Filename: 1.2 Multiple imputation.R -------------------------------- ###
###                                                                          ###
### --- Description: This is the R script for the multiple imputation for -- ###
### --- the paper "Economic Inequality and Public Support for -------------- ###
### --- Redistribution in Europe: A Cross-Sectional and Longitudinal ------- ###
### --- Multilevel Analysis" submitted to the IJPOR ------------------------ ###
###                                                                          ###
### --- Author: Valentin Velev --------------------------------------------- ###
###                                                                          ###
### --- Date: 11.12.2023 --------------------------------------------------- ###
###
### --- Parts of this code are based on Rushani Wijesuriya's script for her  ###
### --- paper "Evaluation of approaches for accommodating interactions ----- ###
### --- and non-linear terms in multiple imputation of incomplete ---------- ###
### --- three-level data" published in the Biometrical Journal. ------------ ###
###                                                                          ###
################################################################################
################################################################################

################################################################################
### --- 1. Preparation ----------------------------------------------------- ###
################################################################################

# install and load packages
required_packages <- c(
  # packages for loading and wrangling data
  "tidyverse", "haven", "readxl", "sjmisc", "sjlabelled", "fastDummies",
  # packages for (multiple) imputation
  "naniar", "imputeTS", "jomo", "mitml",
  # packages for statistical models
  "lme4", "lmerTest", "broom.mixed",
  # packages for post-estimation
  "influence.ME", "easystats", "sjPlot", "jtools", "interplot", "interactions",
  # other packages
  "ggpubr", "qqplotr", "NCmisc", "rstatix"
)

for (p in required_packages){
  
  if(!require(p, character.only=T)) install.packages(p, dependencies=T)
  library(p, character.only=T)
  
}

rm(p, required_packages)

# set seed to ensure replicability
set.seed(2946)

# global options
options(digits=6, scipen=999, max.print=10000)

# load necessary environment objects
load("Environment/data_with_NA")

################################################################################
### --- 2. ML-MI using Jomo ------------------------------------------------ ###
################################################################################

# create df without missings < 5% using LWD and prepare data for MI
data_mi <- 
  data_with_NA[complete.cases(data_with_NA[,c(-4,-6)]),][,c(-16:-22)] %>%
  data.frame()

# check NA distribution
as.data.frame(naniar::miss_var_summary(data_mi))

as.data.frame(naniar::miss_var_summary(data_mi %>% group_by(year)))

# check how many observations were deleted
(nrow(data_with_NA)-nrow(data_mi)) # 27,171 obs
((nrow(data_with_NA)-nrow(data_mi))/nrow(data_with_NA)*100) # or 6.89%

################################################################################

# create country dummies
data_mi <- fastDummies::dummy_cols(data_mi, select_columns = "country")

# run MI (takes 16+ hours to complete)
imp_jomo <- jomo.lmer(formula=income ~
                        redist+lrscale+age+male+badhealth+unionmember+employment+
                        married+edulvl+gdp_pc+gini_disp+country_BE+country_BG+country_CH+
                        country_CY+country_CZ+country_DE+country_DK+country_EE+country_ES+
                        country_FI+country_FR+country_GR+country_HR+country_HU+country_IE+
                        country_IS+country_IT+country_LT+country_LV+country_NL+country_NO+
                        country_PL+country_PT+country_RO+country_SE+country_SI+country_SK+
                        country_UK+(1|country_year),
                      data=data_mi[,c(-1,-2)], level=c(2,rep(1, times=10),2,2,rep(2, times=28)),
                      nburn=5000, nbetween=1000, nimp=20, output=2)

#
save(data_with_NA, data_lwd, data_mi, macrodata, file="Environment/data.RData")
saveRDS(imp_jomo, "Environment/imp_jomo.RDS")

rm(list=ls())

readRDS("Environment/imp_jomo.RDS")

# turn imputed data into list
implist_jomo <- jomo2mitml.list(imp_jomo)

#
rm(imp_jomo)

################################################################################
### --- 3. Wrangle imputed data, run LMMs, and pool estimates -------------- ###
################################################################################

# re-code imputed data using function and lapply
## define functions
myfun1 <- function(x) {
  
  # main re-coding of imputed datasets
  x %>%
    dplyr::rename(country_year = clus) %>%
    mutate(country = stringr::str_extract(country_year, "[A-Z]{2}"),
           year = stringr::str_extract(country_year, "\\d+"), .before="income") %>%
    mutate(across(c(income, lrscale, age), function(x){x - mean(x, na.rm=T)}, .names="{col}_cgm")) %>%
    mutate(across(c(year, male, employment, edulvl, married, badhealth, unionmember), as.factor)) %>%
    de_mean(gini_disp, gdp_pc, grp=country, suffix.dm="_we", suffix.gm="_be") %>%
    dplyr::select(-contains("country_"), -id)
  
}

myfun2 <- function(x) {
  
  # create group mean centered income
  x %>%
    group_by(country) %>%
    mutate(income_cwc = income_cgm - mean(income_cgm)) %>%
    ungroup()
  
}

## apply both functions
implist_jomo_adj <- as.mitml.list(lapply(lapply(implist_jomo, myfun1), myfun2))

#
saveRDS(implist_jomo_adj, "Environment/implist_jomo_adj.RDS")

rm(list=ls())

readRDS("Environment/implist_jomo_adj.RDS")

################################################################################
### --- 4. Run LMMs with MI data ------------------------------------------- ###
################################################################################

# fit models
lmm_list_m0 <- with(
  implist_jomo_adj,
  lmer(redist ~
         1+(1|country)+(1|country:year),
       REML=T)
)

#
saveRDS(lmm_list_m0, "Environment/lmm_list_m0.RDS")

rm(lmm_list_m0)

#
lmm_list_m1 <- with(
  implist_jomo_adj,
  lmer(redist ~
         income_cgm+lrscale_cgm+age_cgm+I(age_cgm^2)+male+edulvl+employment+
         married+badhealth+unionmember+
         year+(1|country)+(1|country:year),
       REML=T, control=lmerControl(optimizer="bobyqa",
                                   optCtrl=list(maxfun=1e6),
                                   calc.derivs=F))
)

#
saveRDS(lmm_list_m1, "Environment/lmm_list_m1.RDS")

rm(lmm_list_m1)

#
lmm_list_m2 <- with(
  implist_jomo_adj,
  lmer(redist ~
         income_cgm+lrscale_cgm+age_cgm+I(age_cgm^2)+male+edulvl+employment+
         married+badhealth+unionmember+
         gini_disp_be+gini_disp_we+
         year+(1|country)+(1|country:year),
       REML=T, control=lmerControl(optimizer="bobyqa",
                                   optCtrl=list(maxfun=1e6),
                                   calc.derivs=F))
)

#
saveRDS(lmm_list_m2, "Environment/lmm_list_m2.RDS")

rm(lmm_list_m2)

#
lmm_list_m3 <- with(
  implist_jomo_adj,
  lmer(redist ~
         income_cgm+lrscale_cgm+age_cgm+I(age_cgm^2)+male+edulvl+employment+
         married+badhealth+unionmember+
         gini_disp_be+gini_disp_we+gdp_pc_be+gdp_pc_we+
         year+(1|country)+(1|country:year),
       REML=T, control=lmerControl(optimizer="bobyqa",
                                   optCtrl=list(maxfun=1e6),
                                   calc.derivs=F))
)

#
saveRDS(lmm_list_m3, "Environment/lmm_list_m3.RDS")

rm(lmm_list_m3)

#
lmm_list_m4 <- with(
  implist_jomo_adj,
  lmer(redist ~
         income_cgm+lrscale_cgm+age_cgm+I(age_cgm^2)+male+edulvl+employment+
         married+badhealth+unionmember+
         gini_disp_be+gini_disp_we+gdp_pc_be+gdp_pc_we+
         year+(1+income_cgm|country)+(1+income_cgm|country:year),
       REML=T, control=lmerControl(optimizer="bobyqa",
                                   optCtrl=list(maxfun=1e6), # if it doesnt work, try maxfun=2e5
                                   calc.derivs=F))
)

#
saveRDS(lmm_list_m4, "Environment/lmm_list_m4.RDS")

rm(lmm_list_m4)

#
lmm_list_m5 <- with(
  implist_jomo_adj,
  lmer(redist ~
         income_cwc+lrscale_cgm+age_cgm+I(age_cgm^2)+male+edulvl+employment+
         married+badhealth+unionmember+
         gini_disp_be+gini_disp_we+gdp_pc_be+gdp_pc_we+
         (gini_disp_be*income_cwc)+(gini_disp_we*income_cwc)+
         year+(1+income_cwc|country)+(1+income_cwc|country:year),
       REML=T, control=lmerControl(optimizer="bobyqa",
                                   optCtrl=list(maxfun=1e6),
                                   calc.derivs=F))
)

#
saveRDS(lmm_list_m5, "Environment/lmm_list_m5.RDS")

rm(lmm_list_m5)

################################################################################
### --- 5. Pool estimates -------------------------------------------------- ###
################################################################################

# load LMMs again
lmm_list_m0 <- readRDS("Environment/lmm_list_m0.RDS")
lmm_list_m1 <- readRDS("Environment/lmm_list_m1.RDS")
lmm_list_m2 <- readRDS("Environment/lmm_list_m2.RDS")
lmm_list_m3 <- readRDS("Environment/lmm_list_m3.RDS")
lmm_list_m4 <- readRDS("Environment/lmm_list_m4.RDS")
lmm_list_m5 <- readRDS("Environment/lmm_list_m5.RDS")

# pool estimates and place them in dataframes for later analysis
m0_mi <- bind_rows(as.data.frame(testEstimates(lmm_list_m0, extra.pars=T)[["estimates"]]),
                   as.data.frame(testEstimates(lmm_list_m0, extra.pars=T)[["extra.pars"]]))

m1_mi <- bind_rows(as.data.frame(testEstimates(lmm_list_m1, extra.pars=T)[["estimates"]]),
                   as.data.frame(testEstimates(lmm_list_m1, extra.pars=T)[["extra.pars"]]))

m2_mi <- bind_rows(as.data.frame(testEstimates(lmm_list_m2, extra.pars=T)[["estimates"]]),
                   as.data.frame(testEstimates(lmm_list_m2, extra.pars=T)[["extra.pars"]]))

m3_mi <- bind_rows(as.data.frame(testEstimates(lmm_list_m3, extra.pars=T)[["estimates"]]),
                   as.data.frame(testEstimates(lmm_list_m3, extra.pars=T)[["extra.pars"]]))

m4_mi <- bind_rows(as.data.frame(testEstimates(lmm_list_m4, extra.pars=T)[["estimates"]]),
                   as.data.frame(testEstimates(lmm_list_m4, extra.pars=T)[["extra.pars"]]))

m5_mi <- bind_rows(as.data.frame(testEstimates(lmm_list_m5, extra.pars=T)[["estimates"]]),
                   as.data.frame(testEstimates(lmm_list_m5, extra.pars=T)[["extra.pars"]]))

#
rm(list=ls(pattern="lmm_"))

################################################################################
### --- 6. Save ------------------------------------------------------------ ###
################################################################################

save(m0_mi, m1_mi, m2_mi, m3_mi, m4_mi, m5_mi, file="Environment/main_models_mi.RData")





